# Copyright 2013 Al Cramer
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
import math

"""
script for creating xml version of roget. The source
text ("roget11raw.txt") has some garbage content that trips up
the parse. These were edited out by hand to yield "roget11.txt",
which is the starting source for these scripts. The warnings
issued by various passes were examined and a few additional
tweaks were made to "roget11.txt".

The final output file generated by this script is "roget.xml".
Various intermediary files are also created. You need to manually
delete them. This is by design: by diff'ing and inspecting them,
you can verify the transforms work as intended.
"""

# Line number of the input line currently being processed. The passes
# update it so that err() and warn() can say where a problem occurred.
lno = 0

# Part-of-speech names recognised as section headers.
validPos = [
    'N',
    'V',
    'Adj',
    'Adv',
    'Phr',
    'Int',
]

# err shutdown
def err(msg):
    """Report a fatal error and terminate the script.

    Prints the current value of the global line counter ``lno`` followed
    by ``msg``, then exits with status 1.
    """
    # Single-argument print(...) behaves the same as the print statement
    # on Python 2 and is also valid on Python 3.
    print("err line %s" % lno)
    print(msg)
    exit(1)
    
def warn(msg):
    """Report a non-fatal problem.

    Prints the current value of the global line counter ``lno`` followed
    by ``msg``; processing continues.
    """
    # Single-argument print(...) behaves the same as the print statement
    # on Python 2 and is also valid on Python 3.
    print("warn line %s" % lno)
    print(msg)

# simple parser helpers
def skip(i,li,skipSet):
    """Advance past characters of li that belong to skipSet.

    Starting at index i, returns the index of the first character that is
    not in skipSet, or len(li) if every remaining character is skipped.
    If i is already at or past the end, i is returned unchanged.
    """
    end = len(li)
    while i < end:
        if li[i] not in skipSet:
            break
        i += 1
    return i
    
def findCloser(i,li):
    """Find the bracket that closes the opener at li[i].

    li[i] must be one of '[', '(' or '{'. Nested bracketed spans of any
    of the three kinds are skipped recursively. Returns the index of the
    matching closer, or -1 if li[i] is not an opener or the span is never
    closed.
    """
    pairs = {'[': ']', '(': ')', '{': '}'}
    closer = pairs.get(li[i])
    if closer is None:
        # an error: not positioned on an opening bracket
        return -1
    pos = i + 1
    end = len(li)
    while pos < end:
        ch = li[pos]
        if ch == closer:
            return pos
        if ch in pairs:
            # jump to the end of the nested span
            pos = findCloser(pos, li)
            if pos == -1:
                return -1
        pos += 1
    # ran off the end without seeing the closer
    return -1

def parseId(i,li):
    """Scan an alphanumeric identifier that starts at li[i].

    Returns the index of the identifier's last character, or -1 when i is
    past the end of li or li[i] is not alphanumeric.
    """
    size = len(li)
    if i >= size or not li[i].isalnum():
        return -1
    last = i
    while last + 1 < size and li[last + 1].isalnum():
        last += 1
    return last

# split input line on char's in char-set "delims"
def splitLine(li,delims):
    """Split li on any character in delims, ignoring bracketed text.

    Spans enclosed in [], () or {} are kept intact even when they contain
    delimiter characters. Each resulting piece is stripped and empty
    pieces are dropped. If a bracket is never closed, a warning is issued
    and an empty list is returned.
    """
    li = li.strip()
    if not li:
        return []
    # Delimiters outside bracketed spans are rewritten to "\n"; the
    # rebuilt string is then split on "\n".
    pieces = []
    pos = 0
    end = len(li)
    while pos < end:
        ch = li[pos]
        if ch in ('[', '(', '{'):
            close = findCloser(pos, li)
            if close == -1:
                warn("splitLine rejected line: %s" % li)
                return []
            # copy the whole bracketed span as one unit
            pieces.append(li[pos:close + 1])
            pos = close + 1
            continue
        pieces.append('\n' if ch in delims else ch)
        pos += 1
    stripped = [part.strip() for part in ''.join(pieces).split('\n')]
    return [part for part in stripped if part]

# Known bad lines, removed in pass1 (they look like leftover heading
# text -- TODO confirm against the source). pass1 strips each non-empty
# line of this literal and blanks out any input line whose stripped text
# equals one of them, so matching is exact and whole-line.
badlinesSrc = \
    """
4. MODAL EXISTENCE
Absolute
SECTION II. RELATION
. 1. ABSOLUTE RELATION
3. CONJUNCTIVE QUANTITY
4. CONCRETE QUANTITY
SECTION IV. ORDER
1. ORDER
2. CONSECUTIVE ORDER
3. COLLECTIVE ORDER
4. DISTRIBUTIVE ORDER
5. ORDER AS REGARDS CATEGORIES
SECTION V. NUMBER
1. NUMBER, IN THE ABSTRACT
2.  CONNECTION BETWEEN CAUSE AND EFFECT
4.  Indirect Power
5.  Combinations of Causes
2. RELATIVE SPACE
3. EXISTENCE IN SPACE
3.  CENTRICAL DIMENSIONS
3. IMPERFECT FLUIDS
SECTION VII.  CREATIVE THOUGHT
DIVISION (II) COMMUNICATION OF IDEAS
SECTION I. NATURE OF IDEAS COMMUNICATED.
SECTION II.  MODES OF COMMUNICATION
Various Qualities of Style
1. Actual Subservience
2. Transfer of Property
3. Interchange of Property
4. MORAL PRACTICE
12-13-90 -->
    """
    
# unescape any html encodings for <,>, and &
# canonicalize various constructs
# kill "<..>" and "%..%"
def pass1():
    """Clean up the hand-edited source: 'roget11.txt' -> 'x1.txt'.

    Four sub-passes run in sequence, each reading the previous output:
      1a. 'roget11.txt' -> 'tmp.txt': unescape &amp;/&gt;/&lt;,
          canonicalize "{ant"/"{opp" and "&c" forms, cut the first
          "<...>" span found within a line, and blank lines that both
          start and end with '%'.
      1b. 'tmp.txt' -> 'tmp1.txt': drop multi-line "<...>" constructs.
      1c. 'tmp1.txt' -> 'tmp2.txt': drop multi-line "%...%" constructs.
      1d. 'tmp2.txt' -> 'x1.txt': blank out the lines listed in
          badlinesSrc.
    In 1b and 1c an excluded region longer than 10 lines is treated as a
    fatal error (err() exits the script).
    """
    # err() reports the module-level counter, so that is the one that
    # must be updated here (a local "lno" would leave it stale).
    global lno

    # 1a: cleanup and canonicalize
    with open('roget11.txt', 'r') as fin, open('tmp.txt', 'w') as fout:
        lno = 0
        for li in fin:
            lno += 1
            if lno % 1000 == 0:
                print("pass1a line %s" % lno)
            # unescape html encodings
            li = li.replace('&amp;', '&')
            li = li.replace('&gt;', '>')
            li = li.replace('&lt;', '<')
            # canonicalize "{ant" and "{opp"
            li = li.replace('{ant.', '{ant ')
            li = li.replace('{opp.', '{ant ')
            li = li.replace('{ant to', '{ant ')
            li = li.replace('{ant from', '{ant ')
            # canonicalize "&c" and kill useless cases
            li = li.replace('&c.', '&c')
            li = re.sub(r';\s*\&c;', ';', li)
            li = li.replace('&c;', ';')
            li = li.replace('&cv.;', ';')
            # kill "<...>" within a line
            S = li.find('<')
            E = li.find('>')
            if S != -1 and E != -1 and E > S:
                lix = '%s %s' % (li[0:S], li[E+1:])
                if len(lix.strip()) > 0:
                    fout.write(lix)
                else:
                    fout.write('\n')
                continue
            # kill lines that start and end with '%'
            test = li.strip()
            if len(test) > 1 and test.startswith('%') \
                    and test.endswith('%'):
                fout.write('\n')
                continue
            fout.write(li)

    # 1b: get rid of multiline <..> constructs
    with open('tmp.txt', 'r') as fin, open('tmp1.txt', 'w') as fout:
        lno = 0
        exclude = False
        excludeS = 0
        for li in fin:
            lno += 1
            if lno % 1000 == 0:
                print("pass 1b line %s" % lno)
            test = li.strip()
            if test.startswith('<'):
                # opening line of a construct: remember where it began
                excludeS = lno
                exclude = True
                fout.write('\n')
                continue
            if exclude:
                if lno - excludeS > 10:
                    err('pass1b. excludeS %d' % excludeS)
                if test.endswith('>'):
                    exclude = False
                continue
            fout.write(li)

    # 1c: get rid of multiline %..% constructs
    with open('tmp1.txt', 'r') as fin, open('tmp2.txt', 'w') as fout:
        lno = 0
        exclude = False
        excludeS = 0
        for li in fin:
            lno += 1
            if lno % 1000 == 0:
                print("pass 1c line %s" % lno)
            test = li.strip()
            if test.startswith('%'):
                # a '%' line toggles the excluded region on or off
                if exclude:
                    exclude = False
                else:
                    exclude = True
                    excludeS = lno
                fout.write('\n')
                continue
            if exclude:
                if lno - excludeS > 10:
                    err('pass1c. excludeS %d' % excludeS)
                continue
            fout.write(li)

    # 1d: get rid of known bad lines
    print("excluding known bad lines...")
    badlines = set()
    for li in badlinesSrc.split('\n'):
        li = li.strip()
        if len(li) > 0:
            badlines.add(li)
    with open('tmp2.txt', 'r') as fin, open('x1.txt', 'w') as fout:
        lno = 0
        for li in fin:
            lno += 1
            if li.strip() in badlines:
                print("excluded %s" % li)
                fout.write('\n')
                continue
            fout.write(li)

# add missing indents for pos section headers, then
# report suspect lines so they can be edited out by hand.
def pass2():
    """Normalise POS header indentation: 'x1.txt' -> 'x2.txt'.

    A line whose first token is one of validPos but which is not indented
    by exactly 5 spaces is rewritten with a 5-space indent. All other
    lines are copied with trailing whitespace removed. Afterwards
    'x2.txt' is re-read and lines that do not fit the expected layout are
    printed as "suspect" for manual inspection.
    """
    global lno
    with open('x1.txt', 'r') as fin, open('x2.txt', 'w') as fout:
        lno = 0
        for li in fin:
            lno += 1
            li = li.rstrip()
            if len(li) > 0:
                i = skip(0, li, [' '])
                nWhite = i
                idE = parseId(i, li)
                if idE != -1:
                    tok = li[i:idE+1]
                    if tok in validPos and nWhite != 5:
                        print("reindent lno %s: %s\n" % (lno, li))
                        fout.write("     ")
                        fout.write(li[i:])
                        fout.write('\n')
                        continue
            fout.write(li)
            fout.write('\n')

    # reopen output, find suspect lines
    print("Finding suspect lines...")
    with open('x2.txt', 'r') as fin:
        lno = 0
        inSection = False
        for li in fin:
            lno += 1
            li = li.rstrip()
            if len(li) == 0:
                # a blank line ends the current section
                inSection = False
                continue
            nWhite = skip(0, li, [' '])
            if nWhite == 5:
                inSection = True
                continue
            # unindented lines are fine only inside a section
            if nWhite == 0 and inSection:
                continue
            print('suspect line %s: %s\n' % (lno, li))

# vet indentation: 0 or 5
def vetIndent(fn):
    """Print every line of file fn whose indentation is not 0 or 5 spaces.

    Each line is right-stripped before its leading spaces are counted, so
    blank lines count as indent 0. Offending lines are printed with their
    1-based line number. Nothing is returned and the file is not modified.
    """
    print("Vetting indententation ...")
    # "with" guarantees the file is closed (the handle used to leak)
    with open(fn, 'r') as fin:
        for lineNo, li in enumerate(fin, 1):
            li = li.rstrip()
            # number of leading space characters
            nWhite = len(li) - len(li.lstrip(' '))
            if nWhite == 5 or nWhite == 0:
                continue
            print('suspect line %s: %s\n' % (lineNo, li))
    print("vetIndent fini")
    
# adjust off-by-1 indents
def pass3():
    """Repair indents that are off by one: 'x2.txt' -> 'x3.txt'.

    A 1-space indent is reduced to 0 and a 6-space indent to 5; every
    other line is copied with trailing whitespace removed. The result is
    then checked with vetIndent().
    """
    global lno
    with open('x2.txt', 'r') as fin, open('x3.txt', 'w') as fout:
        lno = 0
        for li in fin:
            lno += 1
            li = li.rstrip()
            if len(li) > 0:
                i = skip(0, li, [' '])
                nWhite = i
                if nWhite == 1:
                    print("reindent %s: %s\n" % (lno, li))
                    fout.write(li[i:])
                    fout.write('\n')
                    continue
                if nWhite == 6:
                    print("reindent %s: %s\n" % (lno, li))
                    fout.write("     ")
                    fout.write(li[i:])
                    fout.write('\n')
                    continue
            fout.write(li)
            fout.write('\n')
    # reopen output, find suspect lines
    vetIndent('x3.txt')
    print('pass3 fini')

# join continuation lines. If line_i is indented, and i+1
# is not, then i+1 is appended to i.
def pass4():
    """Join continuation lines into blocks: 'x3.txt' -> 'x4.txt'.

    A line indented by 5 spaces starts a new block; following unindented
    lines are appended to it. Each block is written as one line, its
    parts joined by single spaces. Blank lines are dropped. Any other
    indentation, or an unindented line before the first block, is a fatal
    error (err() exits the script).
    """
    global lno
    curBlk = []
    with open('x3.txt', 'r') as fin, open('x4.txt', 'w') as fout:
        lno = 0
        for li in fin:
            lno += 1
            li = li.rstrip()
            if len(li) == 0:
                continue
            i = skip(0, li, [' '])
            nWhite = i
            if not (nWhite == 0 or nWhite == 5):
                err("pass4 bad indent: %s\n" % li)
            if nWhite == 5:
                # a new block: flush the one collected so far
                if len(curBlk) > 0:
                    fout.write(' '.join(curBlk))
                    fout.write('\n')
                curBlk = [li[i:]]
                continue
            # Explicit check rather than assert, so the validation still
            # runs when Python is started with -O.
            if len(curBlk) == 0:
                err("pass4 continuation line before any block: %s\n" % li)
            curBlk.append(li[i:])
        # catch last block
        if len(curBlk) > 0:
            fout.write(' '.join(curBlk))
            fout.write('\n')
    vetIndent('x4.txt')
    print('pass4 fini')
    
def parseHead(li):
    """Parse a section header line into a "!startSection" declaration.

    li is expected to begin with a one-character marker (pass5 calls this
    for lines starting with '#'), followed by an alphanumeric section id,
    then the title. Text inside "[...]" is collected as notes rather than
    title. Scanning of the title stops at "--" or at an 'N' followed by
    '.' or ' ' (taken as the start of the noun part-of-speech text).

    Returns [i, sectDecl], where i is the index in li of the first
    character after the header and its trailing '-', '.' and ' '
    characters, and sectDecl is '!startSection:<id>:<title>:<notes>'.
    Calls err(), which exits, if the id or the title is missing.
    """
    idE = parseId(1,li)
    if idE == -1:
        # the format needs a %s placeholder; without one the % operator
        # raises TypeError instead of reporting the bad line
        err("parseHead err: %s" % li)
    sectionId = li[1:idE+1]
    i = skip(idE+1,li,['.',' '])
    wrd = []
    notes = []
    # dst is the list currently receiving characters: title or notes
    dst = wrd
    while i < len(li):
        if i+1<len(li) and li[i]=='N' and \
            (li[i+1]=='.' or li[i+1]==' '):
            i -= 1
            break
        if i+1<len(li) and li[i]=='-' and \
            li[i+1]=='-':
            break
        if li[i] == '[':
            dst = notes
        elif li[i] == ']':
            dst.append(' ')
            dst = wrd
        elif li[i]=='.' or li[i]=='-':
            # punctuation inside the title becomes a space
            dst.append(' ')
        else:
            dst.append(li[i])
        i += 1
    wrd = ''.join(wrd).strip()
    notes = ''.join(notes).strip()
    i = skip(i,li,['-','.',' '])
    if len(sectionId)==0 or len(wrd)==0:
        err("parseHead: %s\n" % li)
    sectDecl = '!startSection:%s:%s:%s' % \
        (sectionId,wrd,notes)
    return [i,sectDecl]

# parse headers. This creates "!startSection:..." and "!pos" constructs
def pass5():
    """Turn headers into directives: 'x4.txt' -> 'x5.txt'.

    Sub-passes:
      5a. 'x4.txt' -> 'tmp.txt': each line starting with '#' is replaced
          by the "!startSection:..." line from parseHead(), followed by
          the rest of the line if it begins with a valid POS name.
      5b. 'tmp.txt' is re-read to verify that the line after every
          "!startSection" begins with a valid POS name.
      5c. 'tmp.txt' -> 'tmp1.txt': a line beginning with a POS name is
          split into a "!pos:<name>" line plus the remaining text.
      5d. 'tmp1.txt' -> 'x5.txt': whitespace and some punctuation
          patterns are canonicalized; empty lines are dropped.
    Malformed input in 5a/5b is fatal (err() exits the script).
    """
    global lno

    # 5a: split section headers
    with open('x4.txt', 'r') as fin, open('tmp.txt', 'w') as fout:
        lno = 0
        for li in fin:
            lno += 1
            li = li.rstrip()
            if len(li) > 0 and li[0] == '#':
                [i, sectDecl] = parseHead(li)
                fout.write(sectDecl + '\n')
                if i < len(li):
                    posE = parseId(i, li)
                    tok = li[i:posE+1]
                    if tok in validPos:
                        fout.write(li[i:] + '\n')
                    else:
                        err("pass 5 bad pos(?): %s\n" % li)
                continue
            fout.write(li + '\n')

    # 5b: reopen output file: validate line after !sectionStart starts
    # with pos. readline() is used because two lines are consumed per
    # header.
    with open('tmp.txt', 'r') as fin:
        lno = 0
        while True:
            li = fin.readline()
            lno += 1
            if len(li) == 0:
                break
            if li.startswith('!startSection'):
                li = fin.readline()
                lno += 1
                idE = parseId(0, li)
                if idE == -1 or li[0:idE+1] not in validPos:
                    err("pass5b err:%s" % li)

    # 5c: create "!pos:XXX" constructs
    with open('tmp.txt', 'r') as fin, open('tmp1.txt', 'w') as fout:
        lno = 0
        for li in fin:
            lno += 1
            li = li.rstrip()
            if len(li) > 0:
                tokE = parseId(0, li)
                if tokE != -1:
                    tok = li[0:tokE+1]
                    if tok in validPos:
                        fout.write('!pos:%s\n' % tok)
                        i = tokE + 1
                        # bounds check: the line may consist of the POS
                        # token alone, with no '.' after it
                        if i < len(li) and li[i] == '.':
                            i += 1
                        else:
                            warn('pass5c err:%s\n' % li)
                        remainder = li[i:].strip()
                        if len(remainder) > 0:
                            fout.write(remainder + '\n')
                        continue
            fout.write(li + '\n')

    # 5d: create final version for this pass, canonicalizing spaces
    with open('tmp1.txt', 'r') as fin, open('x5.txt', 'w') as fout:
        lno = 0
        for li in fin:
            lno += 1
            li = li.rstrip()
            li = re.sub(r'\s+', ' ', li)
            li = li.replace('( ', '(')
            li = li.replace(', )', ')')
            li = li.replace(',)', ')')
            li = li.replace(' )', ')')
            li = li.replace('] ,', '], ')
            li = li.replace('&c', '&c ')
            li = li.replace('- ', '-')
            li = li.replace('[Fr.]', '[Fr]')
            li = li.replace('[It.]', '[It]')
            li = li.replace('[Lat.]', '[Lat]')
            li = li.replace('[Gr.]', '[Gr]')
            li = re.sub(r'\s+', ' ', li)
            if len(li) > 0:
                fout.write(li + '\n')
    print('pass5 fini')
    
# parse a term in a sense: canonicalize xref's. Then append to "terms"
def parseTerm(terms,src):
    """Clean one term and append it to terms, or silently drop it.

    Cross references are rewritten to the form "#<section>^<keyword>"
    (spaces in the keyword become '_'). Three source shapes are handled:
    "(happy) 147", "let fall &c 308" and "location 184". Remaining "&c"
    markers and bracketed material are removed.

    terms: list that receives the cleaned term (modified in place).
    src:   raw term text.
    Returns None. Terms that end up empty, contain junk characters or
    non-printable/non-ASCII characters, have no letter, carry more than
    one '#', are a bare function word, or end with an article are not
    appended; most rejections print a message.
    """
    raw = src
    # kill internal ref's
    src = src.strip()
    intRefs = ('&c n', '&c v', '&c adj', '&cadj', '&c adv', '&cn', '&cv')
    for ir in intRefs:
        src = src.replace(ir,'')
    src = re.sub(r'\s+',' ',src)
    # canonicalize dashes: "be -called"
    src = src.replace(' -',' ')
    # The dash replacement can leave a double space ("x - y" -> "x  y").
    # Collapse again so the single-space xref patterns below still match.
    src = re.sub(r'\s+',' ',src)

    # catch ref's of form: "(happy) 147"
    # (Generally these are preceded by "&c"; but not always)
    pat = r'\(([^\)]+)\) (\d+\w*)'
    while True:
        m = re.search(pat,src)
        if m is None:
            break
        keyword = m.groups()[0]
        sect = m.groups()[1]
        repl = '#%s^%s' % (sect,keyword.replace(' ','_'))
        targ = '(%s) %s' % (keyword,sect)
        src = src.replace(targ,repl)

    # also these forms:
    # "let fall &c 308"
    # "spectral &c 980"
    pat = r'([\w\s-]+) &c (\d+\w*)'
    while True:
        m = re.search(pat,src)
        if m is None:
            break
        keyword = m.groups()[0].strip()
        if len(keyword)==0:
            break
        sect = m.groups()[1]
        repl = '#%s^%s' % (sect,keyword.replace(' ','_'))
        targ = '%s &c %s' % (keyword,sect)
        replaced = src.replace(targ,repl)
        if replaced == src:
            # targ is rebuilt from the stripped keyword; if it is not
            # literally present nothing changes and the same match would
            # be found forever. Stop instead of looping.
            break
        src = replaced

    # also this form: "location 184"
    pat = r'([\w]+) (\d+\w*)'
    while True:
        m = re.search(pat,src)
        if m is None:
            break
        keyword = m.groups()[0]
        sect = m.groups()[1]
        repl = '#%s^%s' % (sect,keyword)
        targ = '%s %s' % (keyword,sect)
        src = src.replace(targ,repl)

    # remove "&c" and "c" by itself (a variant)
    src = src.replace('&c','')
    src = src.replace(' c ','')

    # excise bracketed material
    src = re.sub(r'\[[^\]]*\]','',src)
    src = re.sub(r'\([^\)]*\)','',src)
    src = re.sub(r'\{[^\}]*\}','',src)

    src = src.strip()
    src = re.sub(r'\s+',' ',src)

    # excise some spurious patterns (trailing part-of-speech markers);
    # rstrip so the term is not stored with a trailing space
    if src.endswith('Adv') or src.endswith('Adj'):
        src = src[:-3].rstrip()
    if src.endswith('N') or src.endswith('V'):
        src = src[:-1].rstrip()

    # reject junk cases
    if re.search(r'[\(\)\[\]\{\}:<>&"]',src):
        print('parseTerm rejected:%s' % raw)
        return
    # reject non-standard ascii chars (these are errors
    # caused by dirty source text)
    if any(ord(ch) < 32 or ord(ch) > 126 for ch in src):
        print('parseTerm rejected:%s' % raw)
        return

    # require at least 1 word char
    if len(src) > 0 and not re.search(r'[a-zA-Z]',src):
        print('parseTerm rejected:%s' % raw)
        return

    # rare cases give us screwed up cross-refs: should be at most 1
    if src.count('#') > 1:
        print('parseTerm rejected messed up xref:%s' % raw)
        return

    # reject odd little cases that emerge from errors
    reject = ('in','from','at','by','on','for','the',
              'a','of','be','not')
    if src in reject or len(src)==0:
        return

    # more odd cases: end with an article
    if src.endswith(' the') or src.endswith(' a') or \
        src.endswith(' an'):
        return
    terms.append(src)

# A subgroup whose text contains any of these snippets is dropped whole.
killSubgrpPhr = [
    'cease to',
    'have no',
]

# A term whose text contains any of these snippets is dropped.
killTermPhr = [
    '[U',
    'obs3',
    '"',
    '*',
    '|',
    '[Fr]',
    '[It]',
    '[Gr]',
    '[Grk]',
    '[Lat]',
]
           
# parse a subgroup  -- a comma separated list of synonyms.
def parseSubgrp(src):
    """Parse one subgroup into a cleaned, comma-separated term list.

    A leading "[...]" head (and a comma directly after it) is discarded.
    The subgroup is rejected if it contains a phrase from killSubgrpPhr.
    The remainder is split on commas; terms containing a snippet from
    killTermPhr are skipped and the rest go through parseTerm().

    Returns the surviving terms joined with ', ', or None if src is
    empty, malformed, rejected, or yields no terms.
    """
    if len(src) == 0:
        return None
    terms = []
    # if we open with "[xxx]", that's the head
    if src[0] == '[':
        i = findCloser(0,src)
        if i == -1 or i+1 >= len(src):
            print("parseSense excluding src (unclosed head): " + src)
            return None
        _src = src[i+1:].strip()
        # startswith() instead of _src[0]: _src is empty when only
        # whitespace follows the head, and indexing it would raise
        if _src.startswith(','):
            _src = _src[1:].strip()
        if len(_src) == 0:
            print("parseSense excluding src (head error?): " + src)
            return None
        src = _src

    # these phrases within the subgrp cause semantic inaccuracies
    for phr in killSubgrpPhr:
        if src.find(phr) != -1:
            print("parseSense excluding src (killSubgrpPhr): " + src)
            return None

    for t in splitLine(src,[',']):
        t = t.strip()
        if len(t)==0:
            continue
        if any(phr in t for phr in killTermPhr):
            continue
        parseTerm(terms,t)
    return None if len(terms)==0 else ', '.join(terms)
    
# parse a section: "lines" is src for a section
def parseSect(fout,lines):
    """Parse one section and write its groups to fout.

    lines[0] must be the "!startSection:<id>:..." line; it is copied to
    fout unchanged. The remaining lines must be "!pos:<name>" directives,
    each followed by zero or more group lines for that part of speech.
    Every group with at least one surviving subgroup is written as
    "<sectId>.<pos>.<n>:<subgrp>; <subgrp>; ...", numbered from 1 within
    each part of speech, and the section is closed with
    "!endSection:<id>". The 'Int' and 'Phr' parts of speech are skipped.

    An empty lines list is a no-op. A bad first line is fatal via err();
    a non-directive line where a "!pos:" is expected prints the section
    source and exits with status 2.
    """
    if len(lines) == 0:
        return
    li = lines[0]
    if not li.startswith('!startSection:'):
        err("parseSect:%s" % li)
    curSectId = li.split(':')[1]
    fout.write(li + '\n')
    # dictionary of groups by part-of-speech
    posGrps = {}
    for pos in validPos:
        posGrps[pos] = []
    i = 1
    while i<len(lines):
        li = lines[i]
        if not li.startswith('!pos:'):
            print('parseSect. unexpected line:%s' % li)
            print('src for section:\n')
            print('\n'.join(lines))
            exit(2)
        pos = li.split(':')[1]
        i += 1
        # each line (up to the next !pos directive) is a
        # group for this pos. So we're building up a collection
        # of groups.
        while i < len(lines):
            if lines[i].startswith('!pos:'):
                break
            posGrps[pos].append(lines[i])
            i += 1

    for pos in validPos:
        if pos == 'Int' or pos == 'Phr':
            # these sections are of no interest: skip
            continue
        # each grp is comprised of a set of subgroups separated by
        # semi-colons and/or periods. A subgroup is either a single
        # term, or a comma-separated synonyms list
        grpEnum = 1
        for grpSrc in posGrps[pos]:
            subgrps = []
            for subgrpSrc in splitLine(grpSrc,[';','.']):
                subgrp = parseSubgrp(subgrpSrc)
                if subgrp is not None:
                    subgrps.append(subgrp)
            if len(subgrps) > 0:
                grpId = '%s.%s.%s' %\
                    (curSectId,pos,grpEnum)
                fout.write('%s:%s\n' % \
                    (grpId,'; '.join(subgrps)))
                grpEnum += 1
    fout.write('!endSection:%s\n' % curSectId)
    

# parse sections, creating groups (actually sequences of subgrps)
def pass6():
    """Parse every section: 'x5.txt' -> 'x6.txt'.

    Lines are collected from one "!startSection" line up to the next (or
    to end of file) and each collected section is handed to parseSect(),
    which writes the result.
    """
    global lno
    # restart the counter, as the other passes do, so err()/warn()
    # report positions relative to 'x5.txt'
    lno = 0
    with open('x5.txt', 'r') as fin, open('x6.txt', 'w') as fout:
        curSect = []
        while True:
            li = fin.readline()
            lno += 1
            atEof = len(li) == 0
            if atEof or li.startswith('!startSection'):
                # flush the section collected so far (no-op if empty)
                parseSect(fout, curSect)
                if atEof:
                    break
                curSect = [li.rstrip()]
                continue
            curSect.append(li.rstrip())
    print('pass6 fini')
    
# cleanup title for xml section; also used for notes
def cleanText(src):
    """Return src tidied for use as a section title or note.

    Runs of whitespace are collapsed to one space, "&c" markers (with a
    preceding ", " if present) are removed, and "{...}" spans made up
    only of word characters, whitespace, ',', '|' and '-' are removed.
    The result is stripped of leading and trailing whitespace.
    """
    src = re.sub(r'\s+',' ',src)
    src = re.sub(r', \&c','',src)
    src = re.sub(r'\&c','',src)
    # Literal backslash followed by "&c". Written with an escaped
    # backslash: "\&" is not a valid escape sequence in a normal string.
    src = src.replace('\\&c','')
    src = re.sub(r'\{[\w|\s|,|-]*\}','',src)
    return src.strip()

def writeXml():
    """Write the final XML: 'x6.txt' -> 'roget.xml'.

    "!startSection:<id>:<title>:<note>" opens a <section> element whose
    title and note attributes are cleaned with cleanText() and
    XML-escaped; empty ones are omitted. Every other line is a group
    "<sectId>.<pos>.<n>:<content>": groups are wrapped in an element
    named after their part of speech and written as <g id="..."> with
    one ';'-terminated subgroup per line. '^' in the content is replaced
    by ':'. "!endSection:<id>" closes the open part-of-speech element,
    if any, and the section.
    """
    global lno
    # Local import: only this function needs XML escaping.
    from xml.sax.saxutils import escape
    quot = {'"': '&quot;'}
    lno = 0
    with open('x6.txt', 'r') as fin, open('roget.xml', 'w') as fout:
        fout.write('<?xml version="1.0"?>\n')
        fout.write('<roget>\n')
        curPos = ''
        for li in fin:
            lno += 1
            li = li.strip()
            if li.startswith('!startSection'):
                [dummy,curSectId,title,note] = li.split(':')
                attrs = ['id="%s"' % curSectId]
                title = cleanText(title)
                note = cleanText(note)
                # escape so that &, <, > or " in the text cannot break
                # the attribute
                if len(title) > 0:
                    attrs.append('title="%s"' % escape(title, quot))
                if len(note) > 0:
                    attrs.append('note="%s"' % escape(note, quot))
                fout.write('<section %s>\n' % ' '.join(attrs))
                curPos = ''
                continue
            if li.startswith('!endSection'):
                # A section without groups has no open part-of-speech
                # element; writing a closer for it would emit "</>".
                if len(curPos) > 0:
                    fout.write('</%s>\n' % curPos)
                    curPos = ''
                [dummy,curSectId] = li.split(':')
                fout.write('</section>\n')
                continue
            # split on the first ':' only: the id never contains one
            [_id,content] = li.split(':', 1)
            pos = _id.split('.')[1]
            if pos != curPos:
                if len(curPos) > 0:
                    fout.write('</%s>\n' % curPos)
                curPos = pos
                fout.write('<%s>\n' % curPos)
            content = content.replace('^',':')
            fout.write('<g id="%s">\n' % _id)
            for lix in content.split(';'):
                lix = lix.strip()
                if len(lix) > 0:
                    fout.write('  %s;\n' % lix)
            fout.write('</g>\n')
        fout.write('</roget>\n')
    print('wrote "roget.xml"')

    
if __name__ == "__main__":
    # Run the stages in order; each one reads the file written by the
    # stage before it.
    for stage in (pass1, pass2, pass3, pass4, pass5, pass6, writeXml):
        stage()

   
